Data

Which years are missing a metric

dat %>% 
  ggplot(aes(year, lakeid, fill=is.na(daynum))) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~metric, nrow=6) 

Remaining data issues:

  • Northern lakes don’t have DOC until until 1986; missing first 4 years when all other variables are available. Potential fixes:
    • Exclude DOC; start analyses in 1986; fill DOC with e.g., median value, or model of other variables?
  • Southern lakes missing chlorophyll data: 1996 - 1999 and 2002 - 2004; 2002-04 is bad/uncalibrated values, used DOY but not magnitude; earlier years fill with median
  • Zoop data issues:
    • CB missing daphnia peak 1995(?): fill with median
    • FI missing several years from 1996 - 2006; fill with median

Predict DOC daynum from other variable daynum’s in each northern lake

no_dups = dat %>% group_by(lakeid, year, metric) %>% summarise(N = n()) %>% filter(N == 1)
## `summarise()` has grouped output by 'lakeid', 'year'. You can override using
## the `.groups` argument.
n_lakes_wide = left_join(no_dups, dat) %>% 
  select(-sampledate) %>% 
  pivot_wider(names_from = metric, values_from = daynum) %>% 
  filter(lakeid %in% c("AL", "BM", "CB", "CR", "SP", "TB", "TR"))
## Joining, by = c("lakeid", "year", "metric")
hold_data = na.omit(data.frame(n_lakes_wide %>%  ungroup() %>% select(-year, -N))) # , -doc_predict
all = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_openwater_max)*lakeid, data=hold_data)
i_o = lm(doc_epiMax~1, data=hold_data)
hold_step = step(i_o, scope=formula(all))
## Start:  AIC=1761.44
## doc_epiMax ~ 1
## 
##                        Df Sum of Sq    RSS    AIC
## + lakeid                6     74137 395186 1733.7
## + stratoff              1     15586 453737 1755.6
## <none>                              469323 1761.4
## + secchi_openwater_max  1      1334 467989 1762.8
## + iceon                 1       565 468758 1763.2
## + straton               1       551 468772 1763.2
## + anoxia_summer         1       311 469012 1763.3
## + energy                1       305 469018 1763.3
## + stability             1        65 469258 1763.4
## 
## Step:  AIC=1733.72
## doc_epiMax ~ lakeid
## 
##                        Df Sum of Sq    RSS    AIC
## + stratoff              1      6275 388911 1732.0
## <none>                              395186 1733.7
## + anoxia_summer         1      2791 392395 1734.1
## + energy                1      2484 392702 1734.3
## + iceon                 1       595 394591 1735.4
## + straton               1       121 395065 1735.7
## + secchi_openwater_max  1        60 395126 1735.7
## + stability             1        56 395130 1735.7
## - lakeid                6     74137 469323 1761.4
## 
## Step:  AIC=1732.03
## doc_epiMax ~ lakeid + stratoff
## 
##                        Df Sum of Sq    RSS    AIC
## <none>                              388911 1732.0
## + energy                1      3173 385738 1732.1
## + anoxia_summer         1      3055 385856 1732.2
## - stratoff              1      6275 395186 1733.7
## + iceon                 1       367 388544 1733.8
## + stability             1       232 388679 1733.9
## + straton               1       218 388693 1733.9
## + secchi_openwater_max  1        27 388884 1734.0
## + stratoff:lakeid       6     10379 378532 1737.8
## - lakeid                6     64826 453737 1755.6
doc_model = lm(doc_epiMax~(iceon+straton+stratoff+energy+
                 stability+anoxia_summer+
                 secchi_openwater_max)*lakeid, 
               data=n_lakes_wide, 
               na.action = na.exclude)
summary(doc_model)
## 
## Call:
## lm(formula = doc_epiMax ~ (iceon + straton + stratoff + energy + 
##     stability + anoxia_summer + secchi_openwater_max) * lakeid, 
##     data = n_lakes_wide, na.action = na.exclude)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -150.321  -17.754    2.572   22.905  119.066 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)  
## (Intercept)                    6.565e+01  1.691e+02   0.388   0.6983  
## iceon                          2.499e-01  3.647e-01   0.685   0.4940  
## straton                        1.685e-01  3.627e-01   0.465   0.6428  
## stratoff                       5.087e-01  3.117e-01   1.632   0.1044  
## energy                        -1.686e-01  2.244e-01  -0.751   0.4535  
## stability                     -1.210e-01  1.494e-01  -0.810   0.4192  
## anoxia_summer                 -7.801e-02  1.775e-01  -0.440   0.6607  
## secchi_openwater_max          -8.199e-02  8.251e-02  -0.994   0.3217  
## lakeid.L                       2.877e+02  4.488e+02   0.641   0.5223  
## lakeid.Q                       3.595e+02  4.370e+02   0.823   0.4118  
## lakeid.C                       2.718e+02  4.509e+02   0.603   0.5474  
## lakeid^4                       1.705e+02  4.573e+02   0.373   0.7097  
## lakeid^5                      -3.865e+02  4.530e+02  -0.853   0.3948  
## lakeid^6                       3.226e+02  4.371e+02   0.738   0.4615  
## iceon:lakeid.L                -1.070e+00  9.624e-01  -1.112   0.2676  
## iceon:lakeid.Q                -5.617e-01  9.511e-01  -0.591   0.5555  
## iceon:lakeid.C                -4.825e-01  9.578e-01  -0.504   0.6151  
## iceon:lakeid^4                 5.985e-02  1.000e+00   0.060   0.9524  
## iceon:lakeid^5                 3.784e-01  9.531e-01   0.397   0.6918  
## iceon:lakeid^6                -1.010e-01  9.630e-01  -0.105   0.9166  
## straton:lakeid.L              -9.552e-01  9.708e-01  -0.984   0.3265  
## straton:lakeid.Q               1.588e-01  8.818e-01   0.180   0.8573  
## straton:lakeid.C               1.778e-01  9.835e-01   0.181   0.8567  
## straton:lakeid^4               6.986e-01  1.034e+00   0.676   0.5001  
## straton:lakeid^5               1.452e+00  9.960e-01   1.458   0.1467  
## straton:lakeid^6               7.466e-01  8.815e-01   0.847   0.3981  
## stratoff:lakeid.L             -7.548e-02  8.350e-01  -0.090   0.9281  
## stratoff:lakeid.Q             -3.476e-01  7.480e-01  -0.465   0.6427  
## stratoff:lakeid.C             -1.518e-02  8.381e-01  -0.018   0.9856  
## stratoff:lakeid^4             -5.921e-01  9.186e-01  -0.645   0.5200  
## stratoff:lakeid^5              8.062e-01  8.412e-01   0.958   0.3391  
## stratoff:lakeid^6             -1.532e+00  7.542e-01  -2.032   0.0436 *
## energy:lakeid.L                3.454e-01  5.974e-01   0.578   0.5639  
## energy:lakeid.Q               -4.228e-01  5.525e-01  -0.765   0.4452  
## energy:lakeid.C               -1.824e-01  5.978e-01  -0.305   0.7606  
## energy:lakeid^4               -9.815e-02  6.504e-01  -0.151   0.8802  
## energy:lakeid^5               -4.342e-01  5.982e-01  -0.726   0.4689  
## energy:lakeid^6               -5.155e-02  5.604e-01  -0.092   0.9268  
## stability:lakeid.L            -9.498e-02  3.465e-01  -0.274   0.7843  
## stability:lakeid.Q             1.516e-01  3.811e-01   0.398   0.6913  
## stability:lakeid.C            -8.111e-02  3.990e-01  -0.203   0.8392  
## stability:lakeid^4            -3.708e-01  3.482e-01  -1.065   0.2883  
## stability:lakeid^5            -1.637e-01  4.454e-01  -0.368   0.7137  
## stability:lakeid^6            -1.004e-01  4.403e-01  -0.228   0.8198  
## anoxia_summer:lakeid.L         2.287e-01  4.741e-01   0.482   0.6302  
## anoxia_summer:lakeid.Q         1.894e-01  4.752e-01   0.399   0.6907  
## anoxia_summer:lakeid.C        -9.741e-03  4.518e-01  -0.022   0.9828  
## anoxia_summer:lakeid^4        -1.033e-01  5.073e-01  -0.204   0.8389  
## anoxia_summer:lakeid^5        -5.992e-02  4.282e-01  -0.140   0.8889  
## anoxia_summer:lakeid^6         9.392e-02  4.767e-01   0.197   0.8440  
## secchi_openwater_max:lakeid.L  5.057e-01  2.347e-01   2.155   0.0325 *
## secchi_openwater_max:lakeid.Q -4.229e-01  2.348e-01  -1.801   0.0733 .
## secchi_openwater_max:lakeid.C -1.705e-01  2.244e-01  -0.760   0.4483  
## secchi_openwater_max:lakeid^4 -9.056e-02  1.956e-01  -0.463   0.6438  
## secchi_openwater_max:lakeid^5 -1.488e-01  2.137e-01  -0.697   0.4870  
## secchi_openwater_max:lakeid^6  1.677e-01  2.037e-01   0.823   0.4115  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 42.93 on 180 degrees of freedom
##   (37 observations deleted due to missingness)
## Multiple R-squared:  0.2949, Adjusted R-squared:  0.07939 
## F-statistic: 1.368 on 55 and 180 DF,  p-value: 0.06508
n_lakes_wide$doc_predict = predict(doc_model, n_lakes_wide)

cor = n_lakes_wide %>% 
  group_by(lakeid) %>% 
  summarise(r = paste("r =", round(cor(doc_epiMax, doc_predict, use="complete.obs"), 3)))

n_lakes_wide %>% 
  ggplot(aes(x=doc_epiMax, y=doc_predict, color=as.character(lakeid))) +
  geom_point()+
  theme_bw() +
  labs(color="lakeid")+
  geom_abline(slope=1, intercept = 0) + 
  facet_wrap(~lakeid) + 
  geom_text(aes(label=r), data=cor, x=300, y=0) +
  labs(x="obs peak DOC (daynum)",  y="modeled peak DOC (daynum)")
## Warning: Removed 37 rows containing missing values (`geom_point()`).

# not good but vaguely positive relationship; use this instead of median?
missing_doc = n_lakes_wide %>% filter(is.na(doc_epiMax))

missing_doc$doc_fill = round(predict(doc_model, missing_doc))

doc_fill = missing_doc %>% 
  select(lakeid, year, doc_fill) %>% 
  rename(daynum_fill = doc_fill) %>% 
  mutate(metric = "doc_epiMax") %>% 
  filter(year < 2000)

Create filled metric column and filled

all_fill = bind_rows(doc_fill)
dat_filled = full_join(dat, all_fill) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, FALSE)) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum) & !is.na(daynum_fill), daynum_fill, daynum))
## Joining, by = c("lakeid", "metric", "year")
dat_filled$lakeid = factor(dat_filled$lakeid, 
                    levels = c("AL", "BM", "CB", "CR", "SP", "TB", "TR", "FI", "ME", "MO", "WI"), 
                    ordered = T)
vars = c("iceoff", "straton", "secchi_all", "secchi_openwater_max","secchi_openwater_min", "zoopDensity","zoopDensity_CC",
        "doc_epiMax","totpuf_epiMax", "totpuf_epiMin", "totpuf_hypoMax", "totpuf_hypoMin", 
        "anoxia", "anoxia_summer", "stability", "energy", "stratoff", "iceon")
dat_filled$metric = factor(dat_filled$metric,
                    levels = rev(vars),
                    ordered=T)

dat_filled %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

## Additional trimming/filling

Trim to “good” years, fill FI ice data with Monona, then fill additional missing with median values for each lake/metric

df_yearsWant = dat_filled %>% 
  filter((lakeid %in% c("FI", "ME", "MO", "WI") & year %in% 1996:2018) |
           (!(lakeid %in% c("FI", "ME", "MO", "WI")) & year %in% 1982:2018))

fi_icefill = df_yearsWant %>% 
  filter(lakeid == "MO" & metric %in% c("iceoff", "iceon")) %>% 
  mutate(lakeid = "FI") %>% 
  select(-daynum, -filled_flag, -sampledate) %>% 
  rename(icefill = daynum_fill)

df_yearsWant = df_yearsWant %>% 
  full_join(fi_icefill) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill) & !is.na(icefill), 
                              icefill, daynum),
         filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill) & !is.na(icefill), 
                              TRUE, filled_flag))
## Joining, by = c("lakeid", "metric", "year")
df_yearsWant %>% 
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

Final fill w/ medians

all_lys = df_yearsWant %>% 
  select(lakeid, year) %>% 
  distinct()

metric = unique(df_yearsWant$metric)

all_lys_want = expand_grid(all_lys, metric)

dat_final = left_join(all_lys_want, df_yearsWant)
## Joining, by = c("lakeid", "year", "metric")
medians = dat_final %>% 
  group_by(lakeid, metric) %>% 
  summarise(median_daynum = median(daynum_fill, na.rm=T))
## `summarise()` has grouped output by 'lakeid'. You can override using the
## `.groups` argument.
dat_final = dat_final %>% 
  left_join(medians) %>% 
  mutate(daynum_fill = ifelse(is.na(daynum_fill), median_daynum, daynum_fill)) %>% 
  mutate(filled_flag = ifelse(is.na(daynum) & !is.na(daynum_fill), TRUE, filled_flag)) %>% 
  select(-icefill, -median_daynum)
## Joining, by = c("lakeid", "metric")
dat_final %>%  
  ggplot(aes(year, metric, fill=filled_flag)) +
  geom_tile(color="grey") +
  theme_bw() +
  facet_wrap(~lakeid, nrow=6)

write_csv(dat_final, "Data/analysis_ready/final_combined_dates_filled_v2.csv")